/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ package net.nutch.fetcher; import net.nutch.net.protocols.Response; import java.net.InetAddress; import java.net.URL; import java.util.Date; import java.text.DateFormat; import net.nutch.pagedb.FetchListEntry; import net.nutch.net.protocols.http.Http; import net.nutch.net.protocols.http.HttpResponse; import net.nutch.util.LogFormatter; import net.nutch.util.StringUtil; import java.util.logging.Logger; import java.util.logging.Level; import java.util.logging.Handler; /** * <code>RequestRecord<code>s represent a URL's state in the system, * either encapsulating a {@link FetchListEntry} or representing a * <code>robots.txt<code> request and what it's status is. * * <p> * * RequestRecords are passed around between {@link HostQueue}s, the * {@link RequestScheduler}, {@link FetcherThread}s, and {@link * OutputThread}s. They count retries and redirects, track * redirect chains, remember failure reasons, and hold the content * from successful fetches. * */ public class RequestRecord { public static final Logger LOG= LogFormatter.getLogger("net.nutch.fetcher.RequestRecord"); private URL url; private String urlString; // previous request (set if this RequestRecord is the result of a redirect) private RequestRecord redirectedFrom; // the FetchListEntry associated with the request (set if this is not a // robots.txt request) private FetchListEntry fle; // the HostQueue that this request belongs to private HostQueue hostQueue; // whether the HostQueue wants to know when this request finishes private boolean hostQueueWantsNotification; // how many errors we've encountered fetching this URL private int numErrors; // how many redirects we've encountered fetching this URL private int redirects; // after a failure, this encodes what went wrong private int failureReason; // after a failure, this contains strings that should be logged // to clarify the problem private String[] failureMessages; // after an output failure, this encodes what went wrong private int outputStatus; // after an output failure, this contains strings that should be logged // to clarify the problem private String[] outputStatusMessages; // after an error, this encodes what went wrong private int errorReason; // after an error, this contains strings that should be logged // to clarify the problem (usually just the URL string) private String[] errorMessages; // whether or not this request has failed private boolean hasFailed; // the HTTP version code; what version should we try this request with? // fetcher may modify this value to tell HostQueue to fall back private int httpVersion; // after a successful fetch, this points to the Http.Response we got private Response response; // after a successful fetch, this holds the date from the "Expires" header private Date expireTime; // bandwidth consumed for this request private long bytesTransmitted; private long bytesReceived; private InetAddress addr; /** * Creates a new <code>RequestRecord</code>, which encapsulates the * given {@link FetchListEntry} and {@link URL}. If the * <code>hostQueue</code> field is filled in, the RequestRecord is * associated with the given <code>HostQueue</code>. */ RequestRecord(URL url, FetchListEntry fle, HostQueue hostQueue) { this.redirectedFrom= null; this.url= url; this.fle= fle; this.hostQueue= hostQueue; this.hostQueueWantsNotification= false; this.numErrors= 0; this.redirects= 0; this.response= null; this.failureReason= 0; this.hasFailed= false; this.httpVersion= Http.HTTP_VER_NOTSET; this.bytesTransmitted= 0; this.bytesReceived= 0; this.addr= null; } /** * Creates a new <code>RequestRecord</code>, as a redirect from the * supplied <code>RequestRecord</code>. The new target is the * supplied <code>URL</code>. If the <code>hostQueue</code> * field is filled in, the RequestRecord is associated with the * given <code>HostQueue</code>. The number of bytes transferred * is initialized from the parent request. */ RequestRecord(RequestRecord redirectedFrom, URL redirectedTo, HostQueue hostQueue) { this.redirectedFrom= redirectedFrom; this.url= redirectedTo; this.fle= redirectedFrom.getFetchListEntry(); this.hostQueue= hostQueue; this.hostQueueWantsNotification= false; this.numErrors= redirectedFrom.getNumErrors(); this.redirects= redirectedFrom.getNumRedirects(); this.response= null; this.failureReason= 0; this.hasFailed= false; this.httpVersion= Http.HTTP_VER_NOTSET; this.bytesTransmitted= redirectedFrom.bytesTransmitted; this.bytesReceived= redirectedFrom.bytesReceived; this.addr= null; } /** * Creates a new <code>RequestRecord</code>, which encapsulates the * given {@link FetchListEntry} and sets it's failure flag to * <code>hasFailed</code>. The RequestRecord created this way is * suitable for handoff to an {@link OutputThread}, but not a {@link * FetcherThread}, since the <code>URL</code> field is not filled * in. Requests may be created this way for * <code>FetchListEntries</code> which are not scheduled for * fetching (see FetchListEntry.getFetch()), or those that are * filtered for other reasons. */ RequestRecord(URL url, FetchListEntry fle, boolean hasFailed) { this.redirectedFrom= null; this.url= url; this.fle= fle; this.hostQueue= null; this.hostQueueWantsNotification= false; this.numErrors= 0; this.redirects= 0; this.response= null; this.failureReason= 0; this.hasFailed= hasFailed; this.httpVersion= Http.HTTP_VER_NOTSET; this.bytesTransmitted= 0; this.bytesReceived= 0; this.addr= null; } /** * Creates a new <code>RequestRecord</code>, which encapsulates the * given {@link FetchListEntry} and sets it's failure flag to * <code>hasFailed</code>. A RequestRecord created this way is * suitable for handoff to an {@link OutputThread}, but not a {@link * FetcherThread}, since the <code>URL</code> field is not filled * in. Requests may be created this way for * <code>FetchListEntries</code> which are not scheduled for * fetching (see FetchListEntry.getFetch()), or those that are * filtered for other reasons. */ RequestRecord(FetchListEntry fle, boolean hasFailed) { this(null, fle, hasFailed); } /** * Increment the error count for this RequestRecord */ public void incrementErrors() { numErrors++; } /** * Returns the current error count for this RequestRecord */ public int getNumErrors() { return numErrors; } /** * Increment the redirect count for this RequestRecord */ public void incrementRedirects() { redirects++; } /** * Returns the current redirect count for this RequestRecord */ public int getNumRedirects() { return redirects; } /** * Sets the response for this RequestRecord */ public void setResponse(Response response) { this.response= response; if (response != null) { expireTime= null; String expireStr= response.getHeader("Expires"); if (expireStr != null) { try { DateFormat df= DateFormat.getDateInstance(DateFormat.LONG); Date date= df.parse(expireStr); expireTime= date; } catch (Exception e) { ; } } } } /** * Returns the response for this RequestRecord */ public Response getResponse() { return response; } /** * Returns the {@link FetchListEntry} associated with this * <code>RequestRecord</code>, or <code>null</code> if this * is a <code>robots.txt</code> request. */ public FetchListEntry getFetchListEntry() { return fle; } /** * Returns <code>true</code> if this is a <code>robots.txt</code> request, or * <code>false</code> otherwise. */ public boolean isRobotsRequest() { return fle == null; } /** * Returns the {@link URL} associated with this RequestRecord (which * may be down a redirect path from the original request). */ public URL getURL() { return url; } /** * Returns the {@link URL} associated with this RequestRecord (which * may be down a redirect path from the original request), as a * <code>String</code>. */ public String getURLString() { if (urlString == null) { if (url == null) urlString= "null"; else urlString= url.toString(); } return urlString; } /** * Returns the original {@link URL} associated with this * RequestRecord (ie the beginning of any redirect path that has * been followed). */ public URL getOriginalURL() { RequestRecord tmp= this; while (tmp.redirectedFrom != null) { tmp= tmp.redirectedFrom; } return tmp.url; } /** * Returns the HTTP version code associated with this RequestRecord. * HostQueues should set this value to determine the protocol used * for a fetch attempt, and FetcherThreads may reset it to tell the * HostQueue to use a different version. */ public int getHttpVersion() { return httpVersion; } /** * Sets the HTTP version code associated with this RequestRecord. * HostQueues should set this value to determine the protocol used * for a fetch attempt, and FetcherThreads may reset it to tell the * HostQueue to use a different version. */ public void setHttpVersion(int httpVersion) { this.httpVersion= httpVersion; } /** * Gets the {@link HostQueue} that is responsible for queueing this * request. */ public HostQueue getHostQueue() { return hostQueue; } /** * Sets the {@link HostQueue} that is responsible for queueing this * request. The current <code>HostQueue<code> must be * <code>null</code>; it may not be re-set. * * @throws IllegalStateException if the <code>hostQueue</code> is already set */ public void setHostQueue(HostQueue hostQueue) { if (this.hostQueue != null) throw new IllegalStateException("Can't reset hostQueue!"); this.hostQueue= hostQueue; } /** * Sets the <code>hasFailed</code> code. */ public void setHasFailed(boolean hasFailed) { this.hasFailed= hasFailed; } /** * Returns the <code>hasFailed</code> code. */ public boolean getHasFailed() { return hasFailed; } /** * Sets the <code>FailureReason</code> to <code>code</code>. The * failure reason explains why we didn't to get a page (redirect * loop detected, host is dead, etc), but not necessarily what * specific errors were encountered (refused connection, read * timeout, etc). */ public void setFailureReason(int reasonCode) { this.failureReason= reasonCode; } /** * Returns the <code>FailureReason</code> code. */ public int getFailureReason() { return failureReason; } /** * Associates additional <code>Strings</code> (which should help explain * the problem) with the last <code>failureReason</code> that was set. */ public void setFailureMessages(String[] msgs) { failureMessages= msgs; } /** * Returns additional <code>Strings</code> which may help explain * the failure, or <code>null</code> if no such messages have been * set. */ public String[] getFailureMessages() { return failureMessages; } /** * Sets the <code>ErrorReason</code> to <code>code</code>. The * error reason explains why we didn't get the page on this attempt. * When this method is called, the current * <code>errorMessages</code> is reset (see {@link * #setErrorMessages(String[])}). */ public void setErrorReason(int reasonCode) { this.errorReason= reasonCode; this.errorMessages= null; } /** * Returns the <code>ErrorReason</code> code. */ public int getErrorReason() { return errorReason; } /** * Associates additional <code>Strings</code> (which should help explain * the problem) with the last errorReason that was set. */ public void setErrorMessages(String[] msgs) { errorMessages= msgs; } /** * Returns additional <code>Strings</code> which may help explain * the error, or <code>null</code> if no such messages have been * set. */ public String[] getErrorMessages() { return errorMessages; } /** * Sets the <code>outputStatus</code> to <code>status</code>. When * this method is called, the current * <code>outputStatusMessages</code> are reset (see {@link * #setOutputStatusMessages(String[])}). */ public void setOutputStatus(int status) { this.outputStatus= status; this.outputStatusMessages= null; } /** * Returns the <code>outputStatus</code> code. */ public int getOutputStatus() { return outputStatus; } /** * Associates additional <code>Strings</code> (which should help explain * the status) with the last <code>outputStatus</code> that was set. */ public void setOutputStatusMessages(String[] msgs) { outputStatusMessages= msgs; } /** * Returns additional <code>Strings</code> which may help explain * the outputStatus, or <code>null</code> if no such messages have * been set. */ public String[] getOutputStatusMessages() { return outputStatusMessages; } /** * Returns a {@link Date} Object which represents the HTTP * <code>Expire</code> time associated with the {@link * net.nutch.net.Http.Response} ({see @link #getResponse()}. The * expire time is set from the response, when {@link * #setResponse(Http.Response)} is called. */ public Date getExpireTime() { return expireTime; } /** * Sets the cached InetAddress for the host this request belongs to. */ public void setAddr(InetAddress addr) { this.addr= addr; } /** * Gets the cached InetAddress for the host this request belongs to. * This returns <code>null</code> if it has not been set (or has * previously been set to <code>null</code>). */ public InetAddress getAddr() { return addr; } /** * Returns the immediate parent request of this * <code>RequestRecord</code>, or <code>null</code> if this request * is not the result of a redirect. */ public RequestRecord getParentRequest() { return redirectedFrom; } /** * Returns the original RequestRecord that lead to this request (via * redirects). If this request was not triggered by redirects, * <code>this</code> will be returned. */ public RequestRecord getOriginalRequest() { RequestRecord tmp= this; while (tmp.redirectedFrom != null) tmp= tmp.redirectedFrom; return tmp; } /** * If set to <code>true</code>, the {@link HostQueue} associated * with this RequestRecord will be notified when the request is * finished. This defaults to <code>false</code> when a * RequestRecord is first created. */ public void setNotifyQueue(boolean notify) { this.hostQueueWantsNotification= notify; } /** * Returns <code>true</code> if the {@link HostQueue} associated * with this RequestRecord will be notified when the request is * finished. This defaults to <code>false</code> when a * RequestRecord is first created. */ public boolean getNotifyQueue() { return hostQueueWantsNotification; } /** * Propagates results (<code>hasFailed</code>, * <code>response</code>, <code>failureReason</code>, and * <code>expireTime</code>) to any parent requests. */ private void updateParent() { if (LOG.isLoggable(Level.FINER)) { LOG.finer("updating parent of " + url); } if (redirectedFrom != null) { redirectedFrom.hasFailed= hasFailed; redirectedFrom.response= response; redirectedFrom.failureReason= failureReason; redirectedFrom.expireTime= expireTime; } } /** * Calls {@link HostQueue#requestCompleted(RequestRecord)} on all * {@link HostQueue} Objects that are waiting for notification on this * request or any parent requests. */ public void notifyQueuesOfCompletion() { updateParent(); if (hostQueueWantsNotification) { if (LOG.isLoggable(Level.FINER)) { LOG.finer("notifying " + hostQueue.getKey() + " about " + url); LOG.finer(" orig url was " + getOriginalURL() ); } hostQueue.requestCompleted(this); } if (redirectedFrom != null) redirectedFrom.notifyQueuesOfCompletion(); if (LOG.isLoggable(Level.FINE)) LOG.fine("notified queues about " + url); } }